Setup

Assign variables

date <- Sys.Date()

in_dir <- file.path("..", "results/integrated_dfs")

out_dir <- file.path("..", "results/expression_output")

options(dplyr.summarise.inform = FALSE)

Import files

d.DeKegel_top5_pairs_drug_sens_expr_annot <- read_rds(file.path(in_dir, "d.DeKegel_top5_pairs_drug_sens_expr_annot"))

Reformat DFs

## keep only relevant columns
d.DeKegel_top5_pairs_drug_sens_expr <- d.DeKegel_top5_pairs_drug_sens_expr_annot %>%
  dplyr::select(-c(entrez_pair:prediction_rank, prediction_score:family_size,
                   broad_id, primary_disease, subtype_disease))

## how many gene pairs are included in the paralog dataset?
d.DeKegel_top5_pairs_drug_sens_expr %>%
  distinct(sorted_gene_pair) %>%
  nrow()
## [1] 367

Import functions

source("shared_functions.R")

Paralog analysis

PRISM Phase 1 drug sensitivity

## add target col so single-gene functions will work
d.DeKegel_top5_pairs_drug_sens_expr <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  mutate(target = sorted_gene_pair)

Continuous/linear regression

## A1 expr

## use broom::glance to get rsquared and pval
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A1_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::glance(lm(dependency ~ A1_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  rename(r_squared = r.squared, p_val = p.value)

## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary)

d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary <- d.stats %>%
  dplyr::select(-p_val) %>%
  right_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary, by = c("target", "compound_id")) %>%
  dplyr::select(target, compound_id, r_squared, p_val, fdr)

## use broom::tidy to get slope
## note: group_modify expects a DF output
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A1_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::tidy(lm(dependency ~ A1_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  ## extract slope of lm fit line from output
  filter(term == "A1_log_tpm") %>%
  rename(slope = estimate) %>%
  dplyr::select(target, compound_id, slope) 

## add rsquared, p-val, fdr, and slope info to main DF
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary, by = c("target", "compound_id")) %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope, by = c("target", "compound_id")) %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A1_log_tpm))

summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm 
## # target/compound pairs with
## p < 0.05: 481 
## p < 0.01: 249 
## FDR < 0.1: 251 
## FDR < 0.05: 142
## A2 expr

## use broom::glance to get rsquared and pval
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A2_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::glance(lm(dependency ~ A2_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  rename(r_squared = r.squared, p_val = p.value)

## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary)

d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary <- d.stats %>%
  dplyr::select(-p_val) %>%
  right_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary, by = c("target", "compound_id")) %>%
  dplyr::select(target, compound_id, r_squared, p_val, fdr)

## use broom::tidy to get slope
## note: group_modify expects a DF output
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A2_log_tpm)) %>%
  group_by(target, compound_id) %>%
  group_modify(~ broom::tidy(lm(dependency ~ A2_log_tpm, data = .x))) %>%
  dplyr::ungroup() %>%
  ## extract slope of lm fit line from output
  filter(term == "A2_log_tpm") %>%
  rename(slope = estimate) %>%
  dplyr::select(target, compound_id, slope) 

## add rsquared, p-val, fdr, and slope info to main DF
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary, by = c("target", "compound_id")) %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope, by = c("target", "compound_id")) %>%
  filter(!is.na(dependency)) %>%
  filter(!is.na(A2_log_tpm))

summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm 
## # target/compound pairs with
## p < 0.05: 340 
## p < 0.01: 156 
## FDR < 0.1: 128 
## FDR < 0.05: 74
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
  distinct(target, compound_id, .keep_all = TRUE) %>%
  mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
  group_by(slope_flag) %>%
  summarize(n = n()) %>%
  print_kbl()
slope_flag n
negative 1343
positive 1039
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
  distinct(target, compound_id, .keep_all = TRUE) %>%
  mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
  group_by(slope_flag) %>%
  summarize(n = n()) %>%
  print_kbl()
slope_flag n
negative 1222
positive 1160
## get low expression = low LFC pairs (positive slope)
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope <-d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
  filter(slope > 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope 
## # target/compound pairs with
## p < 0.05: 120 
## p < 0.01: 48 
## FDR < 0.1: 48 
## FDR < 0.05: 29
## get high expression = low LFC pairs (negative slope)
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope <- d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
  filter(slope < 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope 
## # target/compound pairs with
## p < 0.05: 361 
## p < 0.01: 201 
## FDR < 0.1: 203 
## FDR < 0.05: 113
## get low expression = low LFC pairs (positive slope)
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope <-d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
  filter(slope > 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope 
## # target/compound pairs with
## p < 0.05: 158 
## p < 0.01: 82 
## FDR < 0.1: 67 
## FDR < 0.05: 39
## get high expression = low LFC pairs (negative slope)
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope <- d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
  filter(slope < 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope 
## # target/compound pairs with
## p < 0.05: 182 
## p < 0.01: 74 
## FDR < 0.1: 61 
## FDR < 0.05: 35

Plot A1 top hits

A1_top_hits <- d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope %>%
  dplyr::select(target, compound_name, r_squared, fdr) %>%
  distinct(target, compound_name, .keep_all = TRUE) %>%
  arrange(fdr) 

print_kbl(head(A1_top_hits, n = 20))
target compound_name r_squared fdr
FYN_SRC vandetanib 0.0659658 0.0000014
FYN_SRC saracatinib 0.0572548 0.0000035
ATP1A1_ATP1A2 k-strophanthidin 0.0587694 0.0000035
ATP1A1_ATP1A3 k-strophanthidin 0.0587694 0.0000035
ATP1A1_ATP1A4 k-strophanthidin 0.0587694 0.0000035
FYN_YES1 saracatinib 0.0572548 0.0000035
FYN_SRC PD-168393 0.0534604 0.0000095
ARAF_RAF1 AZ-628 0.0376702 0.0004196
MAPK11_MAPK14 tyrphostin-AG-1478 0.0336892 0.0008928
FYN_SRC bosutinib 0.0326309 0.0011196
FYN_YES1 bosutinib 0.0326309 0.0011196
MAP2K1_MAP2K2 arctigenin 0.0301968 0.0024172
CDK2_CDK3 NU6027 0.0258299 0.0066929
CDK2_CDK5 NU6027 0.0258299 0.0066929
EPAS1_HIF1A BAY-87-2243 0.0245436 0.0097530
EPAS1_HIF1A 2-methoxyestradiol 0.0239446 0.0107025
NR1I2_VDR erlotinib 0.0219698 0.0137919
CAMK2D_CAMK2G bosutinib 0.0215397 0.0151139
ESR1_ESR2 ethynodiol-diacetate 0.0209548 0.0180253
GSK3A_GSK3B SB-203580 0.0209268 0.0181552
A1_top_hits_plot_list <- A1_top_hits %>%
  arrange(fdr) %>%
  unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
  head(n = 9) %>%
  pull(target_compound_label)

make_lm_plot(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope, A1_top_hits_plot_list, A1_log_tpm)

Plot A2 top hits

A2_top_hits <- d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope %>%
  dplyr::select(target, compound_name, r_squared, fdr) %>%
  distinct(target, compound_name, .keep_all = TRUE) %>%
  arrange(fdr) 

print_kbl(head(A2_top_hits, n = 20))
target compound_name r_squared fdr
AKT1_AKT3 GSK2110183 0.0982682 0.0000000
AKT2_AKT3 GSK2110183 0.0982682 0.0000000
AKT1_AKT3 MK-2206 0.0873418 0.0000000
AKT2_AKT3 MK-2206 0.0873418 0.0000000
AKT1_AKT3 GDC-0068 0.0607472 0.0000013
AKT2_AKT3 GDC-0068 0.0607472 0.0000013
MAP2K1_MAP2K2 bosutinib 0.0474871 0.0000464
AKT1_AKT3 AZD5363 0.0474116 0.0000464
AKT2_AKT3 AZD5363 0.0474116 0.0000464
ABL1_ABL2 saracatinib 0.0444061 0.0000956
PIK3R1_PIK3R2 CUDC-907 0.0410053 0.0003328
CDK2_CDK5 bosutinib 0.0371005 0.0006195
PTPN11_PTPN6 BVT-948 0.0264080 0.0107209
CDK1_CDK2 NU6027 0.0258299 0.0111549
HDAC4_HDAC5 romidepsin 0.0246434 0.0157450
EPAS1_HIF1A BAY-87-2243 0.0243860 0.0160324
ROCK1_ROCK2 SB-203580 0.0236855 0.0187571
MAP3K2_MAP3K3 bosutinib 0.0233228 0.0195743
AKT1_AKT2 A-674563 0.0221977 0.0259942
PPP3CA_PPP3CB cyclosporin-a 0.0240152 0.0260890
A2_top_hits_plot_list <- A2_top_hits %>%
  arrange(fdr) %>%
  unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
  head(n = 9) %>%
  pull(target_compound_label)

make_lm_plot(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope, A2_top_hits_plot_list, A2_log_tpm)

Outliers for top lm hits

## A2 top hits
target_list <- c("AKT1_AKT2", "AKT2_AKT3", "AKT1_AKT3", "PTPN11_PTPN6", "CDK2_CDK5", "CDK1_CDK2", "MAP2K1_MAP2K2")
compound_list <- c("GSK2110183", "MK−2206", "GDC−0068", "AZD5363", "BVT_948", "bosutinib", "NU6027", "canertinib")

## robustly scale expression data
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered <- d.DeKegel_top5_pairs_drug_sens_expr %>%
  filter(target %in% target_list) %>%
  filter(grepl(str_c(compound_list, collapse = "|"), compound_name)) %>%
  filter(!is.na(A2_log_tpm) | !is.na(dependency)) %>%
  group_by(target, compound_id) %>%
  mutate(scaled_A2_log_tpm = RobScale(A2_log_tpm, center = TRUE, scale = TRUE)) %>%
  mutate(A2_expr_outlier_flag = case_when(
    scaled_A2_log_tpm < -1.5 ~ "low",
    scaled_A2_log_tpm > 1.5 ~ "high",
    TRUE ~ "neither")) %>%
  mutate(scaled_A1_log_tpm = RobScale(A1_log_tpm, center = TRUE, scale = TRUE)) %>%
  mutate(A1_expr_outlier_flag = case_when(
    scaled_A1_log_tpm < -1.5 ~ "low",
    scaled_A1_log_tpm > 1.5 ~ "high",
    TRUE ~ "neither")) %>%
  ungroup() %>%
  mutate(A2_expr_outlier_flag = factor(A2_expr_outlier_flag, levels = c("low", "neither", "high")))%>%
  mutate(A1_expr_outlier_flag = factor(A1_expr_outlier_flag, levels = c("low", "neither", "high")))
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered %>%
  group_by(target, compound_id) %>%
  ## get rid of drug/targets that don't have any "low" cell lines
  filter(any(A2_expr_outlier_flag == "low")) %>%
  ungroup() 

d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_grp_med_diff <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  group_by(target, compound_id, A2_expr_outlier_flag) %>%
  ## calculate median dependency score for each outlier group
  summarize(grp_med_dependency = median(dependency)) %>%
  pivot_wider(names_from = A2_expr_outlier_flag, values_from = grp_med_dependency) %>%
  ## calculate difference between low and neither groups
  mutate(grp_med_diff = low - neither) %>%
  dplyr::select(target, compound_id, grp_med_diff) 

d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  left_join(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_grp_med_diff, by = c("target", "compound_id"))

## low outliers and drug sensitivity
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_pvals <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  filter(A2_expr_outlier_flag != "high") %>%
  group_by(target, compound_id) %>%
  summarize(p_val = wilcox.test(dependency ~ A2_expr_outlier_flag)$p.value)

## adjust for multiple testing overall
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_pvals)
## Adding missing grouping variables: `target`
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
  left_join(d.stats, by = c("target", "compound_id")) 

## filter for hits where low-expressing cell lines are more sensitive to the drug
## (low group has lower median than neither group)
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst %>%
  filter(grp_med_diff < 0)

## filter for hits where low-expressing ell lines are more resistant to the drug
## (low group has higher median than neither group)
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst %>%
  filter(grp_med_diff > 0)

summarize_stats(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens)
## df: d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens 
## # target/compound pairs with
## p < 0.05: 5 
## p < 0.01: 5 
## FDR < 0.1: 5 
## FDR < 0.05: 5
summarize_stats(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist)
## df: d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist 
## # target/compound pairs with
## p < 0.05: 0 
## p < 0.01: 0 
## FDR < 0.1: 0 
## FDR < 0.05: 0
Plot
plot_list <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens %>%
  unite("target_compound_id", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
  distinct(target_compound_id) %>%
  pull(target_compound_id)

make_outlier_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
                  plot_list[1:4], "low", A2_expr_outlier_flag, A2_log_tpm)

make_outlier_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
                  plot_list[5:8], "low", A2_expr_outlier_flag, A2_log_tpm)

make_lineage_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
                  plot_list, A2_log_tpm, A2_expr_outlier_flag)

write_rds(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens, 
          file.path(out_dir, "d.top_outlier_pairs"))